## Input for CICERO is coverage levels of each deduplicated ATAC-Seq library over HARs.

## Note that the FASTA file ("Genome.fa" below) has to be indexed and a sequence dictionary created for it. In the dictionary, the chromosomes have to be ordered lexicographically. Ensure that ChrM is not included in the dictionary.

## Ensure that non-linear sequences are removed from HARs.bed (especially on Chr17 for human and ChrX for mouse).

## To get this input we processed the scATAC-Seq libraries as follows:


java -Xmx20g -jar /Path/To/picard.jar AddOrReplaceReadGroups INPUT=/Path/To/Deduplicated.bam OUTPUT=/Path/To/Labelled_Library.bam RGID=<Group Name> RGLB=<Name of Library> RGPL=illumina RGPU=<barcode of library> RGSM=<name of library>

java -Xmx20g -jar /Path/To/picard.jar ReorderSam INPUT=/Path/To/<Labelled_Deduplicated.bam> OUTPUT=/Path/To/Reordered/Library.bam REFERENCE=/Path/To/Genome.fa ALLOW_INCOMPLETE_DICT_CONCORDANCE=true

samtools index /OUTPUT=/Path/To/Reordered/Library.bam

java -jar /Path/To/GenomeAnalysisTK-3.4-46/GenomeAnalysisTK.jar -T DepthOfCoverage -R ~/Genomes/hg19_lex/hg19.fasta -o /Path/To/Output/Name -I /Path/To/bams.list --countType COUNT_FRAGMENTS -L /Path/To/HARs.bed --filter_reads_with_N_cigar

## The values indicated in the total_cvg columns of the interval_summary outputs of GATK were used for downstream CICERO analysis.

library(cicero)
data = read.table("HARs.txt", sep="\t", header=TRUE)  ##"HARs.txt" is the output of GATK as explained in the above point.
data2 = stack(data)
data2$regions <- rep(data[,1])
data3 <- data2[,c(3,2,1)]
names(data3) <- NULL
head(data3)
input_cds <- make_atac_cds(data3, binarize = TRUE)
set.seed(2017)
input_cds <- detectGenes(input_cds)
input_cds <- estimateSizeFactors(input_cds)
input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6, reduction_method = 'tSNE', norm_method = "none",perplexity=7)
tsne_coords <- t(reducedDimA(input_cds))
row.names(tsne_coords) <- row.names(pData(input_cds))
cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
mm9 <- read.table("chrNameLength.txt", sep="\t")
conns <- run_cicero(cicero_cds, mm9)
genes=read.table('mm9_Genes.bed',sep='\t',header=TRUE,stringsAsFactors=FALSE)
CCAN_assigns <- generate_ccans(conns)
gene_annotation_sub <-genes[,c(1:3,8)]
names(gene_annotation_sub)[4] <- "gene"
input_cds <- annotate_cds_by_site(input_cds, gene_annotation_sub)
head(fData(input_cds))
unnorm_ga <- build_gene_activity_matrix(input_cds, conns)
num_genes <- pData(input_cds)$num_genes_expressed
names(num_genes) <- row.names(pData(input_cds))
cicero_gene_activities <- normalize_gene_activities(unnorm_ga, num_genes)

# generate fake second unnormalized gene activity matrix
#unnorm_ga2 <- build_gene_activity_matrix(input_cds, conns)                    
           
# if you had two datasets to normalize, you would pass both:
# num_genes should then include all cells from both sets
#cicero_gene_activities <- normalize_gene_activities(list(unnorm_ga, unnorm_ga2), num_genes)
meta=read.table('State1_metadata.csv',header=TRUE,sep='\t',row.names=1)
pData(input_cds)$Celltype <- meta[row.names(pData(input_cds)),]
#pData(input_cds)$cell <- NULL
input_cds@phenoData$Celltype
input_cds
row.names(pData(input_cds))
agg_cds <- aggregate_nearby_peaks(input_cds, distance = 10000)
agg_cds <- detectGenes(agg_cds)
agg_cds <- estimateSizeFactors(agg_cds)
agg_cds <- estimateDispersions(agg_cds)
diff_timepoint <- differentialGeneTest(agg_cds,fullModelFormulaStr="~num_genes_expressed", cores=10)
plot_pc_variance_explained(agg_cds, return_all = F)
agg_cds <- reduceDimension(agg_cds, max_components = 2, 
                              norm_method = 'log',
                              num_dim = 4,
                              reduction_method = 'tSNE',
                              verbose = T,perplexity=7)
agg_cds <- clusterCells(agg_cds, verbose = T)
plot_cell_clusters(agg_cds, color_by = 'as.factor(Cluster)')
plot_cell_clusters(agg_cds, color_by = 'as.factor(Celltype)')
clustering_DA_sites <- differentialGeneTest(agg_cds, fullModelFormulaStr = '~Cluster')
clustering_DA_sites
head(clustering_DA_sites)
agg_cds
agg_cds$Clusters
agg_cds$clusters
pData(agg_cds)
head(pData(agg_cds))
aggs_cds$Cluster
agg_cds$Cluster
Cluster1 <- agg_cds[agg_cds$Cluster %in% 1,]
Cluster1
agg_cds@phenoData
agg_cds@phenoData$sampleNames
agg_cds@phenoData$Cluster
agg_cds@phenoData$Cluster %in% 1
Cluster1 <- agg_cds[agg_cds@phenoData$Cluster %in% 1,]
Cluster1
Cluster1 <- agg_cds[,1]
agg_cds
input_cds_lin1 <- input_cds[,row.names(subset(pData(input_cds), cluster= 1))]
head(input_cds_lin1)
input_cds_lin2 <- input_cds[,row.names(subset(pData(input_cds), cluster= 2))]
pData(input_cds_lin1)
input_cds_lin1 <- input_cds[,row.names(subset(pData(input_cds), Cluster= 1))]
input_cds_lin1
input_cds_lin1$Cluster
input_cds$Cluster
pData(input_cds)
head(pData(input_cds))
agg_cds@phenoData$Cluster
agg_cds@phenoData
input_cds_lin1 <- input_cds[,row.names(subset(agg_cds@phenoData, Cluster= 1))]
agg_cds$Cluster
agg_cds_1 <- agg_cds[,row.names(subset(pData(agg_cds), Cluster= 1))]
agg_cds_1
row.names((pData(agg_cds))
)
pData(agg_cds)
head(pData(agg_cds))
agg_cds_1 <- agg_cds[,pData(agg_cds), Cluster= 1))]
agg_cds_1 <- agg_cds[,pData(agg_cds), Cluster= 1]
row.names(subset(pData(agg_cds), Cluster= 1))
row.names(subset(agg_cds$Cluster = 1))
row.names(subset(agg_cds$Cluster == 1))
row.names(agg_cds[agg_cds$Cluster == 1,])
col.names(agg_cds[agg_cds$Cluster == 1,])
colnames(agg_cds[agg_cds$Cluster == 1,])
head(pData(agg_cds))
agg_cds_1 <- agg_cds[,row.names(subset(pData(agg_cds), Cluster  == 1))]
agg_cds_1
agg_cds_2 <- agg_cds[,row.names(subset(pData(agg_cds), Cluster  == 2))]
coons
conns
plot_connections(conns, 
                 alpha_by_coaccess = FALSE, 
                 "chr7", 16081473, 16889365, 
                 gene_model = gene_annotation_sample, 
                 coaccess_cutoff = 0.1, 
                 connection_width = .5, 
                 collapseTranscripts = "longest" )
data(gene_annotation_sample)
gene_annotation_sample
head(gene_annotation_sample)
head(mm9_Genes)
head(genes)
plot_connections(conns, 
                 alpha_by_coaccess = FALSE, 
                 "chr7", 16081473, 16889365, 
                 gene_model = genes, 
                 coaccess_cutoff = 0.1, 
                 connection_width = .5, 
                 collapseTranscripts = "longest" )
plot_connections(conns, 
                 alpha_by_coaccess = FALSE, 
                 "chr7", 16481473, 16489365, 
                 gene_model = genes, 
                 coaccess_cutoff = 0.1, 
                 connection_width = .5, 
                 collapseTranscripts = "longest" )
head(conns)
cicero_cds_1 <- make_cicero_cds(agg_cds_1)
conns_1 <- run_cicero(cicero_cds_1, mm9)
CCAN_assigns_1 <- generate_ccans(conns_1)
cicero_cds_2 <- make_cicero_cds(agg_cds_2)
conns_2 <- run_cicero(cicero_cds_2, mm9)
tsne_coords
slot(agg_cds_1)
slotNames(agg_cds_1)
slotNames(agg_cds_1@reducedDimS)
names(agg_cds_1@reducedDimS)
slotNames(agg_cds_1@reducedDimA)
slotNames(agg_cds_1@reducedDimK)
names(agg_cds_1@reducedDimK)
names(agg_cds_1@reducedDimA)
agg_cds_1@reducedDimK
agg_cds_1@reducedDimA
colnames(aggs_cds_1)
colnames(agg_cds_1)
try <- input_cds[,colnames(agg_cds1)
]
try <- input_cds[,colnames(agg_cds_1)]
try
try$Size_Factor
input_cds_1 <- input_cds[,colnames(agg_cds_1)]
input_cds_1 <- reduceDimension(input_cds_1, max_components = 2, num_dim=6, reduction_method = 'tSNE', norm_method = "none")
tsne_coords_1 <- t(reducedDimA(input_cds_1))
row.names(tsne_coords_1) <- row.names(pData(input_cds_1))
input_cds_2 <- input_cds[,colnames(agg_cds_2)]
input_cds_2 <- reduceDimension(input_cds_2, max_components = 2, num_dim=6, reduction_method = 'tSNE', norm_method = "none")
tsne_coords_2 <- t(reducedDimA(input_cds_2))
row.names(tsne_coords_2) <- row.names(pData(input_cds_2))
cicero_cds_1 <- make_cicero_cds(input_cds_1, reduced_coordinates = tsne_coords_1)
conns_1 <- run_cicero(cicero_cds_1, mm9)
CCAN_assigns_1 <- generate_ccans(conns_1)
cicero_cds_2 <- make_cicero_cds(input_cds_2, reduced_coordinates = tsne_coords_2)
conns_2 <- run_cicero(cicero_cds_2, mm9)
input_cds1
input_cds_1
rownames(input_cds_1)
input_cds_1
plot_connections(conns, 
                 alpha_by_coaccess = FALSE, 
                 "chr10", 77412587, 77626360, 
                 gene_model = genes, 
                 coaccess_cutoff = 0.1, 
                 connection_width = .5, 
                 collapseTranscripts = "longest" )
q()
